package com.asiainfo.zqx;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler that starts from the list page at {@link #SEED_URL}, queues the article pages
 * linked from it, and prints each article's URL, title, source, time, read count, author
 * and body text to standard output.
 *
 * <p>Pages are distinguished by their crawl-datum type: the seed is typed {@code "list"}
 * and every article queued from it is typed {@code "text"}. The read count is only shown
 * on the list page, so it is carried to the article page through the {@code "read"} meta
 * entry of the queued datum.
 */
public class Meirijingjixinwen extends BreadthCrawler {
    /** List page that seeds the crawl. */
    private static final String SEED_URL = "http://fxcj.nbd.com.cn/";

    /** Byline marker that precedes the reporter name. */
    private static final String REPORTER_TAG = "每经记者";

    /** Byline marker that precedes the editor name. */
    private static final String EDITOR_TAG = "每经编辑";

    /** Author reported when no usable byline is found. */
    private static final String DEFAULT_WRITER = "每经网";

    /** Placeholder printed when the read count is missing or not a number. */
    private static final String UNKNOWN_READ_COUNT = "未知";

    /** Matches runs of ASCII whitespace, non-breaking spaces and ideographic spaces. */
    private static final String WHITESPACE_REGEX = "[\\s\\u00A0\\u3000]+";

    /** Position of the read-count {@code <span>} inside a list item's source line. */
    private static final int READ_COUNT_SPAN_INDEX = 2;

    /** Number of trailing characters (the unit suffix) stripped from the read-count label. */
    private static final int READ_COUNT_SUFFIX_LENGTH = 2;

    /**
     * Creates the crawler, seeds it with the list page and registers the article URL rule.
     *
     * @param crawlPath directory name handed to the {@link BreadthCrawler} constructor
     */
    public Meirijingjixinwen(String crawlPath) {
        // NOTE(review): the second argument is presumably WebCollector's autoParse flag;
        // confirm against the library documentation.
        super(crawlPath, false);
        addSeed(new CrawlDatum(SEED_URL, "list"));
        setThreads(1);
        addRegex("http://www.nbd.com.cn/articles/.*/.*.html");
    }

    /**
     * Handles one fetched page: article pages are printed, list pages are scanned for
     * article links which are appended to {@code crawlDatums}.
     *
     * @param page        the fetched page
     * @param crawlDatums collector for follow-up pages to crawl
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.contentType() == null) {
            return;
        }
        if (page.matchType("text")) {
            printArticle(page);
        } else if (page.matchType("list")) {
            // The former extra check matched the page URL against itself used as a regex;
            // it added nothing for the seed and could fail on URLs with regex metacharacters.
            collectArticleLinks(page, crawlDatums);
        }
    }

    /** Prints the fields of a single article page to standard output. */
    private void printArticle(Page page) {
        System.out.println("连接为" + page.url());
        System.out.println("标题为" + page.select("div.g-article-top>h1").text());
        System.out.println("来源为" + page.select("div.g-article-top>p.u-time>span.source").text());
        System.out.println("时间为" + page.select("div.g-article-top>p.u-time>span.time").text());

        // first() yields null when the body has no paragraph; treat that as "no byline".
        Element firstParagraph = page.select("div.g-articl-text>p").first();
        String writer = extractWriter(firstParagraph == null ? null : firstParagraph.text());

        System.out.println("阅读量为" + formatReadCount(page.meta("read")));
        System.out.println("作者为" + writer);
        System.out.println("正文为" + page.select("div.g-articl-text").text());
    }

    /** Queues every article linked from a list page, attaching its read count as meta data. */
    private void collectArticleLinks(Page page, CrawlDatums crawlDatums) {
        for (Element item : page.select("div.g-columnnews>ul>li")) {
            String url = item.select("div.u-columnnews-text>div.u-content>a").attr("abs:href");
            if (url.isEmpty()) {
                // No link in this item: nothing to crawl.
                continue;
            }
            crawlDatums.add(new CrawlDatum(url, "text").meta("read", extractReadCount(item)));
        }
    }

    /**
     * Reads the read-count label of a list item and strips its two-character suffix.
     *
     * @return the label without suffix, or an empty string when the item has no such label
     */
    private static String extractReadCount(Element item) {
        Elements sourceSpans = item.select("p.f-source>span");
        if (sourceSpans.size() <= READ_COUNT_SPAN_INDEX) {
            return "";
        }
        String label = sourceSpans.get(READ_COUNT_SPAN_INDEX).text();
        if (label.length() <= READ_COUNT_SUFFIX_LENGTH) {
            return "";
        }
        return label.substring(0, label.length() - READ_COUNT_SUFFIX_LENGTH);
    }

    /**
     * Derives the author name from the first paragraph of an article.
     *
     * <ul>
     *   <li>Reporter and editor tags both present: the text following the reporter tag, up to
     *       the editor tag (or to the end when the editor tag comes first).</li>
     *   <li>Only the editor tag present: the text following it.</li>
     *   <li>Otherwise: {@link #DEFAULT_WRITER}.</li>
     * </ul>
     *
     * @param byline text of the first body paragraph, may be {@code null}
     * @return the cleaned author name, never {@code null}
     */
    private static String extractWriter(String byline) {
        if (byline == null) {
            return DEFAULT_WRITER;
        }
        String compact = byline.replaceAll(WHITESPACE_REGEX, "");
        int reporterAt = compact.indexOf(REPORTER_TAG);
        int editorAt = compact.indexOf(EDITOR_TAG);

        if (reporterAt >= 0 && editorAt >= 0) {
            int nameStart = reporterAt + REPORTER_TAG.length();
            // Guard against the editor tag appearing before the reporter tag.
            int nameEnd = editorAt >= nameStart ? editorAt : compact.length();
            return compact.substring(nameStart, nameEnd).replace("西安", "");
        }
        if (editorAt >= 0) {
            return compact.substring(editorAt + EDITOR_TAG.length())
                    .replace("nbsp", "")
                    .replace("西安", "");
        }
        return DEFAULT_WRITER;
    }

    /**
     * Normalises the read count carried in the page meta data.
     *
     * @param rawCount meta value, may be {@code null}
     * @return the count as a decimal string, or {@link #UNKNOWN_READ_COUNT} when it is
     *         missing or not an integer
     */
    private static String formatReadCount(String rawCount) {
        if (rawCount == null) {
            return UNKNOWN_READ_COUNT;
        }
        try {
            return String.valueOf(Integer.parseInt(rawCount.replaceAll(WHITESPACE_REGEX, "")));
        } catch (NumberFormatException ignored) {
            // A missing or malformed count must not abort printing the rest of the article.
            return UNKNOWN_READ_COUNT;
        }
    }

    /**
     * Runs the crawler with crawl path {@code "mei"} and passes {@code 2} to {@code start}.
     *
     * @param args unused
     * @throws Exception if the crawl fails
     */
    public static void main(String[] args) throws Exception {
        // The article URL rule is already registered by the constructor.
        Meirijingjixinwen mei = new Meirijingjixinwen("mei");
        mei.start(2);
    }
}
